# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
librarian::shelf(
tidyverse, dismo, DT, here, htmltools, leaflet, mapview, raster, rgbif,
rgdal, rJava, sdmpredictors, sf, spocc, geojsonio, GGally, mgcv, maptools,
caret, # m: modeling framework
pdp, # X: partial dependence plots
ranger, # m: random forest modeling
rpart, # m: recursive partition modeling
rpart.plot, # m: recursive partition plotting
rsample, # d: split train/test data
skimr, # d: skim summarize data table
vip # X: variable importance
)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_type = FALSE, scipen = 999)
# set random seed for reproducibility
set.seed(42)
# graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = TRUE)
# logical, redo full calculations within if statements or not
redo <- FALSE
Species: Red-tailed Hawk (Buteo jamaicensis) Total observations after creating bounding box around habitat area and removing duplicate geometries.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Buteo jamaicensis',
from = 'gbif',
has_coords = T,
limit = 10000))
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
# limit observations to bounding box around north and central america
filter(between(longitude, -167.593385, -51.171266),
between(latitude, 5.645215, 71.374349)) %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) %>% # save space (joinable from obs_csv)
distinct(geometry, .keep_all = TRUE) # remove duplicate geometries
sf::write_sf(obs, obs_geo, delete_dsn = T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9376
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldStreetMap")
Question 1: There were a total of 7,004,451 observations in GBIF for the Red-tailed hawk (Buteo jamaicensis) as of 2022-01-24.
Question 2: There was one point from the original dataset that was in the ocean near Europe. To fix this issues, I created a bounding box around the parts of North and Central America and used this bbox to exclude any locations outside of the Red-tailed Hawks habitat.
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", # altitude
"WC_bio1", # annual mean temperature
"WC_bio2", # mean diurnal temperature range
"ER_tri", # terrain roughness
"WC_bio12") # annual precipitation
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc = 2)
Question 3: The environmental layers that I choose include: altitude, annual mean temperature, mean diurnal temperature range, terrain roughness, and annual precipitation. In the literature I found, other studies used: maximum and minimum temperatures, precipitation, solar radiation, wind speed, and cloud cover as abiotic factors. I felt that the first two variables from the literature were covered by annual mean temperature, mean diurnal temperature range, and annual precipitation. The other variables were not present in the available layers, so I did not include them.
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite = T)
}
env_stack <- stack(env_stack_grd)
# show map
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn = T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(present = 1) %>%
select(present, key),
absence %>% mutate(present = 0,
key = NA)) %>%
mutate(ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn = TRUE)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(pts %>% select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(rownames = FALSE,
options = list(dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(legend.position = c(1, 0),
legend.justification = c(1, 0))
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present), alpha = 0.5))
# setup model data
model_data <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop rows with NA values
nrow(model_data)
## [1] 18689
# fit a linear model
mdl_lm <- lm(present ~ ., data = model_data) # . means all other columns (X)
summary(mdl_lm)
##
## Call:
## lm(formula = present ~ ., data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.11564 -0.41687 0.05479 0.41187 1.23357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.53038786 0.07882251 6.729 0.0000000000176 ***
## WC_alt -0.00011443 0.00001244 -9.196 < 0.0000000000000002 ***
## WC_bio1 0.02994807 0.00220208 13.600 < 0.0000000000000002 ***
## WC_bio2 -0.04723098 0.00215837 -21.883 < 0.0000000000000002 ***
## ER_tri -0.00042966 0.00012862 -3.340 0.000838 ***
## WC_bio12 -0.00027108 0.00001039 -26.100 < 0.0000000000000002 ***
## lon -0.00176759 0.00030868 -5.726 0.0000000104210 ***
## lat 0.00825249 0.00154596 5.338 0.0000000950219 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4516 on 18681 degrees of freedom
## Multiple R-squared: 0.1846, Adjusted R-squared: 0.1843
## F-statistic: 604.2 on 7 and 18681 DF, p-value: < 0.00000000000000022
y_predict_lm <- predict(mdl_lm, model_data, type = "response")
y_true <- pts_env$present
range(y_predict_lm)
## [1] -0.8936088 1.1156414
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl_glm <- glm(present ~ .,
family = binomial(link = "logit"),
data = model_data)
summary(mdl_glm)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = model_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.50204 -1.00911 -0.03428 0.99099 2.77627
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.06305959 0.40734341 0.155 0.87697
## WC_alt -0.00060899 0.00006371 -9.559 < 0.0000000000000002 ***
## WC_bio1 0.15429205 0.01131031 13.642 < 0.0000000000000002 ***
## WC_bio2 -0.23356351 0.01147043 -20.362 < 0.0000000000000002 ***
## ER_tri -0.00200862 0.00066290 -3.030 0.00245 **
## WC_bio12 -0.00141805 0.00005651 -25.096 < 0.0000000000000002 ***
## lon -0.00686829 0.00155792 -4.409 0.00001040251 ***
## lat 0.04786281 0.00804013 5.953 0.00000000263 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25908 on 18688 degrees of freedom
## Residual deviance: 22035 on 18681 degrees of freedom
## AIC: 22051
##
## Number of Fisher Scoring iterations: 4
y_predict_glm <- predict(mdl_glm, model_data, type = "response")
range(y_predict_glm)
## [1] 0.0005872994 0.9562871347
termplot(mdl_glm, partial.resid = TRUE, se = TRUE, main = F)
# fit a generalized additive model with smooth predictors
mdl_gam <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(WC_bio12) + s(lon) + s(lat),
family = binomial, data = model_data)
summary(mdl_gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(WC_bio12) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8549 0.2163 -3.953 0.0000772 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.309 8.840 337.4 <0.0000000000000002 ***
## s(WC_bio1) 8.972 8.999 1251.2 <0.0000000000000002 ***
## s(WC_bio2) 8.300 8.792 615.6 <0.0000000000000002 ***
## s(ER_tri) 8.821 8.969 121.9 <0.0000000000000002 ***
## s(WC_bio12) 5.944 6.680 618.5 <0.0000000000000002 ***
## s(lon) 8.817 8.984 598.5 <0.0000000000000002 ***
## s(lat) 8.819 8.990 295.4 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.52 Deviance explained = 45.7%
## UBRE = -0.24159 Scale est. = 1 n = 18689
plot(mdl_gam, scale = 0)
Question 4: There are two environment variables that seem to contribute most towards presence. For altitude, below about 250 ft of altitude there is a higher chance of presence with a high confidence range. Where there is a diurnal temperature range (bio2) between about 7 and 12 degree there is a higher chance of species presence. Additionally, between about -120 and -100 longitudinal degrees there is a higher chance of species presence.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds") # create new file path
# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# plot environmental rasters
plot(env_stack, nc = 2)
# get presence-only observation points (maxent extracts raster values for you)
obs_sp <- sf::as_Spatial(obs) # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds) | redo){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl_maxent <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl_maxent)
# plot term plots
response(mdl_maxent)
y_predict_maxent <- predict(env_stack, mdl_maxent)
plot(y_predict_maxent, main = 'Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border='dark grey')
Question 4: There are four environment variables that seem to contribute most towards presence for the maxent model. The first two, altitude and bio2 were similar to the GAM results. For altitude, below about 500 ft of altitude there is a high chance of presence. Where there is a diurnal temperature range (bio2) between about 5 and 15 degree there is a high chance of species presence. The other two variables (mean annual temperature and annual precipitation) had values that seems to predict species presence that did not show up in the GAM results. When the mean annual temperature (bio1) is between 10 and 20 degrees there is a high chance of species presence. And when the annual precipitation is below 1,000 there is a high chance of species presence.
decision_tree_data <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(decision_tree_data)
| Name | decision_tree_data |
| Number of rows | 18689 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 9345, 1: 9344 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 558.07 | 638.36 | -70.00 | 101.00 | 279.00 | 808.00 | 3738.00 | ▇▂▁▁▁ |
| WC_bio1 | 0 | 1 | 12.26 | 6.59 | -5.60 | 7.70 | 12.30 | 17.30 | 29.10 | ▂▅▇▇▂ |
| WC_bio2 | 0 | 1 | 12.72 | 2.52 | 4.00 | 11.10 | 12.40 | 14.50 | 21.20 | ▁▃▇▃▁ |
| ER_tri | 0 | 1 | 25.95 | 34.72 | 0.00 | 5.33 | 11.46 | 32.11 | 277.85 | ▇▁▁▁▁ |
| WC_bio12 | 0 | 1 | 787.55 | 452.51 | 52.00 | 438.00 | 753.00 | 1059.00 | 5648.00 | ▇▂▁▁▁ |
| lon | 0 | 1 | -100.69 | 16.70 | -135.29 | -117.08 | -100.87 | -86.96 | -59.96 | ▅▇▇▇▂ |
| lat | 0 | 1 | 38.06 | 8.95 | 9.12 | 33.09 | 38.55 | 43.79 | 60.62 | ▁▂▇▆▂ |
# create training set with 80% of full data
d_split <- rsample::initial_split(decision_tree_data,
prop = 0.8,
strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1 (all data)
table(decision_tree_data$present)
##
## 0 1
## 9345 9344
# show number of rows present is 0 vs 1 (training data)
table(d_train$present)
##
## 0 1
## 7476 7475
# run decision stump model
mdl_dt1 <- rpart(present ~ .,
data = d_train,
control = list(cp = 0,
minbucket = 5,
maxdepth = 1))
mdl_dt1
## n= 14951
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14951 7475 0 (0.50003344 0.49996656)
## 2) WC_bio1< 6.35 2824 179 0 (0.93661473 0.06338527) *
## 3) WC_bio1>=6.35 12127 4831 1 (0.39836728 0.60163272) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl_dt1)
# decision tree with defaults
mdl_dt2 <- rpart(present ~ ., data = d_train)
mdl_dt2
## n= 14951
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14951 7475 0 (0.50003344 0.49996656)
## 2) WC_bio1< 6.35 2824 179 0 (0.93661473 0.06338527) *
## 3) WC_bio1>=6.35 12127 4831 1 (0.39836728 0.60163272)
## 6) lon>=-116.1268 8570 4214 0 (0.50828471 0.49171529)
## 12) WC_bio2>=12.55 4196 1400 0 (0.66634890 0.33365110) *
## 13) WC_bio2< 12.55 4374 1560 1 (0.35665295 0.64334705)
## 26) lat< 23.14866 580 30 0 (0.94827586 0.05172414) *
## 27) lat>=23.14866 3794 1010 1 (0.26620980 0.73379020) *
## 7) lon< -116.1268 3557 475 1 (0.13353950 0.86646050) *
rpart.plot(mdl_dt2)
# plot complexity parameter
plotcp(mdl_dt2)
# rpart cross validation results
mdl_dt2$cptable
## CP nsplit rel error xerror xstd
## 1 0.32976589 0 1.0000000 1.0248829 0.008176363
## 2 0.09337793 1 0.6702341 0.6702341 0.007721249
## 3 0.06956522 3 0.4834783 0.4880268 0.007025512
## 4 0.01000000 4 0.4139130 0.4172575 0.006646461
# caret cross validation results
mdl_caret <- train(present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
Question 5: Based on the complexity plot threshold, what size of tree is recommended?
vip(mdl_caret, num_features = 40, bar = FALSE)
Question 6: what are the top 3 most important variables of your model?
# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2921395
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(present ~ .,
data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(present ~ .,
data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
Question 7: How might variable importance differ between rpart and RandomForest in your model outputs?